A Python 3.7 backend was developed to: - Pull the data from its sources - Prepare it for a load into SQLite - Commit records and report upload status
In a production environment: - Use an enterprise database like Oracle, SQL Server, MySQL, etc. - Schedule the script in Unix via /etc/crontab
#!/usr/bin/python3
import urllib.request
import pandas as pd
import io
import sqlite3
from datetime import datetime
from tqdm import tqdm
# UCI SECOM dataset: space-separated sensor readings, no header row
dataUrl = "http://archive.ics.uci.edu/ml/machine-learning-databases/secom/secom.data"
# pass/fail label (-1/1) plus timestamp for each SECOM record, same row order as dataUrl
labelUrl = "http://archive.ics.uci.edu/ml/machine-learning-databases/secom/secom_labels.data"
# local vendor attributes, joined to the SECOM rows by index in main()
vendorUrl = "./data/vendordata.json"
# column names for sqlite
# maps the raw source column names (spaces, lowercase) to SQL-friendly identifiers
newCols = {
"datetime": "MFG_DATE",
"mat vendor": "MAT_VENDOR",
"part vendor": "PART_VENDOR",
"sil vendor": "SIL_VENDOR",
"adhs vendor": "ADHS_VENDOR",
"sop vendor": "SOP_VENDOR",
}
def dfUpload(df, con, table, timeStamp=True, clearTable=False, debug=False):
    """Insert every row of ``df`` into ``table`` via DB-API connection ``con``.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to load; column names must match the target table's columns.
    con : sqlite3.Connection (or compatible DB-API connection)
    table : str
        Target table name. It is interpolated into the SQL text (identifiers
        cannot be parameterized), so only pass trusted names here.
    timeStamp : bool
        When True, add an INSERTED_ON audit column with the current time.
    clearTable : bool
        When True, DELETE all existing rows before inserting.
    debug : bool
        When True, a failed insert prints the error and the offending row and
        the upload continues; when False, the first failure raises.
    """
    df = df.copy()  # work on a copy so the caller's frame is not mutated
    if timeStamp:
        df['INSERTED_ON'] = datetime.now()
    # Render datetime columns as text BEFORE the NaN->None pass below.
    # The original cast to object dtype first (which breaks the .dt accessor)
    # and used '%h:%m:%s' -- month/epoch directives -- instead of '%H:%M:%S'.
    for col in df.columns:
        if df[col].dtype.kind == 'M':
            notNull = df[col].notnull()
            df[col] = df[col].dt.strftime('%Y-%m-%d %H:%M:%S').where(notNull, None)
    df = df.where(pd.notnull(df), None)  # convert NaN to None, for SQL NULLs
    sqlColumns = '(' + ','.join(df.columns) + ')'
    # qmark placeholders: binding a sequence to ':1'-style named parameters is
    # deprecated in Python's sqlite3 module
    sqlValues = '(' + ','.join(['?'] * len(df.columns)) + ')'
    sqlInsert = "INSERT INTO %s %s VALUES %s" % (table, sqlColumns, sqlValues)
    crsr = con.cursor()
    # uploading (row at a time so tqdm can show progress)
    if clearTable:
        crsr.execute("DELETE FROM %s" % table)
    for row in tqdm(df.values.tolist(), desc="Uploading data", unit="row"):
        if debug:
            try:
                crsr.execute(sqlInsert, row)
            except sqlite3.Error as exc:
                # best-effort mode: report the failure instead of silently passing
                print(exc)
                print(row)
        else:
            crsr.execute(sqlInsert, row)
    con.commit()
    crsr.close()
def main():
    """Fetch the SECOM data and labels, merge local vendor data, and load SQLite."""

    def fetchFrame(url):
        # tried pd.read_html(), but no tables found?
        payload = urllib.request.urlopen(url).read()
        return pd.read_csv(io.BytesIO(payload),
                           encoding="utf8", sep=" ", header=None)

    print("Fetching data from web and formatting...")
    data = fetchFrame(dataUrl)
    data.columns = ["F%d" % i for i in range(len(data.columns))]  # prefix feature columns with "F"
    data['PASS_FAIL'] = fetchFrame(labelUrl)[0]
    # vendor rows align with the SECOM rows by position, hence the index merge
    vendors = pd.read_json(vendorUrl).sort_index()
    df = data.merge(vendors, left_index=True, right_index=True)
    df.rename(index=str, columns=newCols, inplace=True)
    df['ID'] = list(range(len(df)))  # surrogate primary key
    print("Connecting to Sqlite...")
    con = sqlite3.connect("warehouse.db")
    print("Clearing table and inserting records...")
    dfUpload(df, con, "SAMPLE", clearTable=True)
    print("Disconnecting from Sqlite...")
    con.close()
    print("Done!")


if __name__ == '__main__':
    main()
## Fetching data from web and formatting...
## Connecting to Sqlite...
## Clearing table and inserting records...
## Disconnecting from Sqlite...
## Done!
##
##
Uploading data: 0%| | 0/1567 [00:00<?, ?row/s]
Uploading data: 32%|###2 | 506/1567 [00:00<00:00, 5045.09row/s]
Uploading data: 69%|######8 | 1078/1567 [00:00<00:00, 4944.38row/s]
Uploading data: 100%|##########| 1567/1567 [00:00<00:00, 5094.01row/s]
Loading required libraries, clearing cache, and defining a helper function
# database connectivity (used below with RSQLite to read warehouse.db)
library(DBI)
# data-manipulation verbs: mutate/select/group_by/summarize
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# tidy model summaries
library(broom)
# ROC / precision / accuracy performance curves
library(ROCR)
## Loading required package: gplots
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
# extremely randomized trees (model 2); needs a working rJava install
library(extraTrees)
## Loading required package: rJava
# date helpers (floor_date for daily aggregation)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
library(ggplot2)
# interactive charts built on ggplot2/htmlwidgets
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
rm(list = ls()) # clear all data
try(dev.off(),silent=TRUE) # clear all plots
# Helper: preview a huge/wide data frame by printing head() of only its
# first n and last n columns (the middle columns are elided).
printWideDataFrame <- function(df, n){
  # (ncol(df)-n+1):ncol(df) selects exactly the last n columns; the original
  # (ncol(df)-n):ncol(df) was off by one and returned n+1 trailing columns
  head(df[c(1:n, (ncol(df)-n+1):ncol(df))])
}
Since the data has already been normalized into Sqlite, a SELECT statement can be used to pull the table into RAM.
In a production environment: - For ODBC/JDBC, pass connection credentials to the connection object - For REST API “GET”, call web service for response object
# connect to db, fetch table into RAM, disconnect from db
con <- dbConnect(RSQLite::SQLite(), "warehouse.db")
# SQLite stored the timestamps as text, so coerce both back to POSIXct
df_orig <- dbGetQuery(con, "select * from sample") %>%
mutate(INSERTED_ON = as.POSIXct(INSERTED_ON), MFG_DATE = as.POSIXct(MFG_DATE))
dbDisconnect(con)
# preview dataframe
printWideDataFrame(df_orig, 20)
## ID INSERTED_ON MFG_DATE PASS_FAIL MAT_VENDOR
## 1 0 2019-01-16 16:52:43 2008-07-19 11:55:00 -1 ddd
## 2 1 2019-01-16 16:52:43 2008-07-19 12:32:00 -1 eee
## 3 2 2019-01-16 16:52:43 2008-07-19 13:17:00 1 fff
## 4 3 2019-01-16 16:52:43 2008-07-19 14:43:00 -1 ccc
## 5 4 2019-01-16 16:52:43 2008-07-19 15:22:00 -1 ccc
## 6 5 2019-01-16 16:52:43 2008-07-19 17:53:00 -1 eee
## PART_VENDOR SIL_VENDOR ADHS_VENDOR SOP_VENDOR F0 F1 F2
## 1 aaa ddd bbb eee 3030.93 2564.00 2187.733
## 2 ccc ddd aaa aaa 3095.78 2465.14 2230.422
## 3 aaa eee aaa jjj 2932.61 2559.94 2186.411
## 4 ccc hhh aaa eee 2988.72 2479.90 2199.033
## 5 bbb aaa bbb iii 3032.24 2502.87 2233.367
## 6 aaa hhh bbb eee 2946.25 2432.84 2233.367
## F3 F4 F5 F6 F7 F8 F9 F10 F569
## 1 1411.1265 1.3602 100 97.6133 0.1242 1.5005 0.0162 -0.0034 NA
## 2 1463.6606 0.8294 100 102.3433 0.1247 1.4966 -0.0005 -0.0148 NA
## 3 1698.0172 1.5102 100 95.4878 0.1241 1.4436 0.0041 0.0013 68.8489
## 4 909.7926 1.3204 100 104.2367 0.1217 1.4882 -0.0124 -0.0033 25.0363
## 5 1326.5200 1.5334 100 100.3967 0.1235 1.5031 -0.0031 -0.0072 NA
## 6 1326.5200 1.5334 100 100.3967 0.1235 1.5287 0.0167 0.0055 22.5598
## F570 F571 F572 F573 F574 F575 F576 F577 F578 F579
## 1 533.8500 2.1113 8.95 0.3157 3.0624 0.1026 1.6765 14.9509 NA NA
## 2 535.0164 2.4335 5.92 0.2653 2.0111 0.0772 1.1065 10.9003 0.0096 0.0201
## 3 535.0245 2.0293 11.21 0.1882 4.0923 0.0640 2.0952 9.2721 0.0584 0.0484
## 4 530.5682 2.0253 9.33 0.1738 2.8971 0.0525 1.7585 8.5831 0.0202 0.0149
## 5 532.0155 2.0275 8.83 0.2224 3.1776 0.0706 1.6597 10.9698 NA NA
## 6 534.2091 2.3236 8.91 0.3201 2.2598 0.0899 1.6679 13.7755 0.0342 0.0151
## F580 F581 F582 F583 F584 F585 F586 F587 F588
## 1 NA NA 0.5005 0.0118 0.0035 2.3630 NA NA NA
## 2 0.0060 208.2045 0.5019 0.0223 0.0055 4.4447 0.0096 0.0201 0.0060
## 3 0.0148 82.8602 0.4958 0.0157 0.0039 3.1745 0.0584 0.0484 0.0148
## 4 0.0044 73.8432 0.4990 0.0103 0.0025 2.0544 0.0202 0.0149 0.0044
## 5 NA NA 0.4800 0.4766 0.1045 99.3032 0.0202 0.0149 0.0044
## 6 0.0052 44.0077 0.4949 0.0189 0.0044 3.8276 0.0342 0.0151 0.0052
## F589
## 1 NA
## 2 208.2045
## 3 82.8602
## 4 73.8432
## 5 73.8432
## 6 44.0077
Using dplyr commands to: - Force response variable to binary - Add dummy variables to all strings/factors - Remove columns no longer required in calculations
# massage for statistics
# recode the raw SECOM label (-1 = pass, 1 = fail) to binary, one-hot encode
# the vendor strings, and drop columns that should not enter the models
df_stats <- df_orig %>%
mutate(PASS_FAIL = ifelse(PASS_FAIL==1,0,1)) %>% # 1 = pass, 0 = fail
# NOTE(review): dummy_cols() keeps every level (no reference level dropped),
# which leaves the dummies collinear -- harmless for trees, worth confirming
# for the logistic regression
fastDummies::dummy_cols() %>% # add dummy variables for all string columns
select(-c(ID, INSERTED_ON, MFG_DATE, MAT_VENDOR, PART_VENDOR, SIL_VENDOR, ADHS_VENDOR, SOP_VENDOR)) # drop columns
# preview dataframe
printWideDataFrame(df_stats, 20)
## PASS_FAIL F0 F1 F2 F3 F4 F5 F6 F7
## 1 1 3030.93 2564.00 2187.733 1411.1265 1.3602 100 97.6133 0.1242
## 2 1 3095.78 2465.14 2230.422 1463.6606 0.8294 100 102.3433 0.1247
## 3 0 2932.61 2559.94 2186.411 1698.0172 1.5102 100 95.4878 0.1241
## 4 1 2988.72 2479.90 2199.033 909.7926 1.3204 100 104.2367 0.1217
## 5 1 3032.24 2502.87 2233.367 1326.5200 1.5334 100 100.3967 0.1235
## 6 1 2946.25 2432.84 2233.367 1326.5200 1.5334 100 100.3967 0.1235
## F8 F9 F10 F11 F12 F13 F14 F15 F16
## 1 1.5005 0.0162 -0.0034 0.9455 202.4396 0 7.9558 414.8710 10.0433
## 2 1.4966 -0.0005 -0.0148 0.9627 200.5470 0 10.1548 414.7347 9.2599
## 3 1.4436 0.0041 0.0013 0.9615 202.0179 0 9.5157 416.7075 9.3144
## 4 1.4882 -0.0124 -0.0033 0.9629 201.8482 0 9.6052 422.2894 9.6924
## 5 1.5031 -0.0031 -0.0072 0.9569 201.9424 0 10.5661 420.5925 10.3387
## 6 1.5287 0.0167 0.0055 0.9699 200.4720 0 8.6617 414.2426 9.2441
## F17 F18 SIL_VENDOR_eee SIL_VENDOR_hhh SIL_VENDOR_aaa
## 1 0.9680 192.3963 0 0 0
## 2 0.9701 191.2872 0 0 0
## 3 0.9674 192.7035 1 0 0
## 4 0.9687 192.1557 0 1 0
## 5 0.9735 191.6037 0 0 1
## 6 0.9747 191.2280 0 1 0
## SIL_VENDOR_ggg SIL_VENDOR_bbb SIL_VENDOR_ccc SIL_VENDOR_iii
## 1 0 0 0 0
## 2 0 0 0 0
## 3 0 0 0 0
## 4 0 0 0 0
## 5 0 0 0 0
## 6 0 0 0 0
## SIL_VENDOR_fff ADHS_VENDOR_bbb ADHS_VENDOR_aaa SOP_VENDOR_eee
## 1 0 1 0 1
## 2 0 0 1 0
## 3 0 0 1 0
## 4 0 0 1 1
## 5 0 1 0 0
## 6 0 1 0 1
## SOP_VENDOR_aaa SOP_VENDOR_jjj SOP_VENDOR_iii SOP_VENDOR_ddd
## 1 0 0 0 0
## 2 1 0 0 0
## 3 0 1 0 0
## 4 0 0 0 0
## 5 0 0 1 0
## 6 0 0 0 0
## SOP_VENDOR_ccc SOP_VENDOR_kkk SOP_VENDOR_hhh SOP_VENDOR_bbb
## 1 0 0 0 0
## 2 0 0 0 0
## 3 0 0 0 0
## 4 0 0 0 0
## 5 0 0 0 0
## 6 0 0 0 0
## SOP_VENDOR_ggg SOP_VENDOR_fff
## 1 0 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 0
Using javascript wrappers, html page can be used to show interactive volumes and yields over the given time period.
In a production environment, this would be replaced by a standalone web service, mobile app, or other internal facing customer.
# mfg volume vs time
# NOTE: df_orig still carries the raw SECOM coding (-1 = pass, 1 = fail; see
# the df_stats recode, which maps raw 1 to fail). The original compared
# against 1/0 here, so it plotted failures as "Pass" and sum(PASS_FAIL==0)
# was always zero, leaving the "Fail" trace empty.
df_orig %>%
group_by(MFG_DATE = floor_date(MFG_DATE, "day")) %>%
summarize(
QTY=n(),
QTY_PASS = sum(PASS_FAIL==-1),
QTY_FAIL = sum(PASS_FAIL==1)
) %>%
plot_ly(x = ~MFG_DATE, y = ~QTY_PASS, name = "Pass", type = "bar") %>%
add_trace(y = ~QTY_FAIL, name = 'Fail') %>%
layout(
xaxis=list(title='Manufacturing Date'),
yaxis=list(title='Volume'),
barmode="stack",
title='Semi-Conductor Production Volume vs. Time')
# mfg volume vs time (cumulative), with running yield % on a second axis
# NOTE: df_orig uses the raw SECOM coding (-1 = pass, 1 = fail), so the
# counts must compare against -1/1; the original's ==1/==0 comparisons
# counted failures as passes and never matched a failure at all.
df_orig %>%
group_by(MFG_DATE = floor_date(MFG_DATE, "day")) %>%
summarize(
QTY = n(),
QTY_PASS = sum(PASS_FAIL==-1),
QTY_FAIL = sum(PASS_FAIL==1)
) %>%
mutate(
QTY = cumsum(QTY),
QTY_PASS = cumsum(QTY_PASS),
QTY_FAIL = cumsum(QTY_FAIL),
PERC_PASS = QTY_PASS/QTY  # running yield = cumulative passes / cumulative total
) %>%
plot_ly() %>%
add_trace(x = ~MFG_DATE, y = ~QTY_PASS, name = "Pass", type = "bar", yaxis = 'y1') %>%
add_trace(x = ~MFG_DATE, y = ~QTY_FAIL, name = 'Fail', type = "bar", yaxis = 'y1') %>%
add_trace(x = ~MFG_DATE, y = ~PERC_PASS, type = 'scatter', mode = 'lines', name = 'Yield %', yaxis = 'y2') %>%
layout(
xaxis=list(title='Manufacturing Date'),
yaxis=list(side='left',title='Volume (Cumulative)',showgrid=FALSE,zeroline=FALSE),
yaxis2=list(side='right',overlaying="y",title='Yield %',showgrid=FALSE,zeroline=FALSE,tickformat="%"),
barmode="stack",
title='Semi-Conductor Production Volume vs. Time (Cumulative)'
)
Starting with a simple approach due to binary response variable.
Filling NAs with column medians.
Stepping through to find lowest AIC.
# create copy
df1 <- df_stats
# impute NAs in dataframe with column medians
# (only sensor columns F0..F589 qualify; PASS_FAIL and the vendor dummies
# fail the "F + digits" test and are left untouched)
for(col in names(df1)) {
# feature columns only (they start with "F" and the other digits are numeric)
if((substring(col,1,1) == "F") && !is.na(as.numeric(substring(col,2)))) {
df1[is.na(df1[,col]), col] <- median(df1[,col], na.rm = TRUE)
}
}
# preview dataframe
printWideDataFrame(df1, 20)
## PASS_FAIL F0 F1 F2 F3 F4 F5 F6 F7
## 1 1 3030.93 2564.00 2187.733 1411.1265 1.3602 100 97.6133 0.1242
## 2 1 3095.78 2465.14 2230.422 1463.6606 0.8294 100 102.3433 0.1247
## 3 0 2932.61 2559.94 2186.411 1698.0172 1.5102 100 95.4878 0.1241
## 4 1 2988.72 2479.90 2199.033 909.7926 1.3204 100 104.2367 0.1217
## 5 1 3032.24 2502.87 2233.367 1326.5200 1.5334 100 100.3967 0.1235
## 6 1 2946.25 2432.84 2233.367 1326.5200 1.5334 100 100.3967 0.1235
## F8 F9 F10 F11 F12 F13 F14 F15 F16
## 1 1.5005 0.0162 -0.0034 0.9455 202.4396 0 7.9558 414.8710 10.0433
## 2 1.4966 -0.0005 -0.0148 0.9627 200.5470 0 10.1548 414.7347 9.2599
## 3 1.4436 0.0041 0.0013 0.9615 202.0179 0 9.5157 416.7075 9.3144
## 4 1.4882 -0.0124 -0.0033 0.9629 201.8482 0 9.6052 422.2894 9.6924
## 5 1.5031 -0.0031 -0.0072 0.9569 201.9424 0 10.5661 420.5925 10.3387
## 6 1.5287 0.0167 0.0055 0.9699 200.4720 0 8.6617 414.2426 9.2441
## F17 F18 SIL_VENDOR_eee SIL_VENDOR_hhh SIL_VENDOR_aaa
## 1 0.9680 192.3963 0 0 0
## 2 0.9701 191.2872 0 0 0
## 3 0.9674 192.7035 1 0 0
## 4 0.9687 192.1557 0 1 0
## 5 0.9735 191.6037 0 0 1
## 6 0.9747 191.2280 0 1 0
## SIL_VENDOR_ggg SIL_VENDOR_bbb SIL_VENDOR_ccc SIL_VENDOR_iii
## 1 0 0 0 0
## 2 0 0 0 0
## 3 0 0 0 0
## 4 0 0 0 0
## 5 0 0 0 0
## 6 0 0 0 0
## SIL_VENDOR_fff ADHS_VENDOR_bbb ADHS_VENDOR_aaa SOP_VENDOR_eee
## 1 0 1 0 1
## 2 0 0 1 0
## 3 0 0 1 0
## 4 0 0 1 1
## 5 0 1 0 0
## 6 0 1 0 1
## SOP_VENDOR_aaa SOP_VENDOR_jjj SOP_VENDOR_iii SOP_VENDOR_ddd
## 1 0 0 0 0
## 2 1 0 0 0
## 3 0 1 0 0
## 4 0 0 0 0
## 5 0 0 1 0
## 6 0 0 0 0
## SOP_VENDOR_ccc SOP_VENDOR_kkk SOP_VENDOR_hhh SOP_VENDOR_bbb
## 1 0 0 0 0
## 2 0 0 0 0
## 3 0 0 0 0
## 4 0 0 0 0
## 5 0 0 0 0
## 6 0 0 0 0
## SOP_VENDOR_ggg SOP_VENDOR_fff
## 1 0 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 0
# The stepwise-selection run below is kept commented out because it is very
# slow on 600+ predictors; m1_formula hardcodes the variables chosen by the
# "both" direction of a previous run.
# # initialize models
# m1_full = glm(PASS_FAIL ~ ., data=df1, family=binomial(), control = list(maxit = 50))
# m1_null = glm(PASS_FAIL ~ 1, data=df1, family=binomial(), control = list(maxit = 50))
#
# # down-select variables
# # m1_bwd = step(m1_full, direction="backward"), backward not a good choice for high dimensionality problem
# m1_fwd = step(m1_null, scope=list(lower=m1_null, upper=m1_full), direction="forward")
# m1_both = step(m1_null, scope = list(upper=m1_full), direction="both")
#
# # compare methods
# if(m1_fwd$aic<m1_both$aic){
# print("Step forward selection chosen")
# m1_varModel = m1_fwd
# }else{
# print("Step both selection chosen")
# m1_varModel = m1_both
# }
# m1_formula <- m1_varModel$formula
# m1_formula
# # BOTH SELECTED, TODO
m1_formula <- as.formula(
"PASS_FAIL ~ SIL_VENDOR_eee + F103 + F59 + F21 + F73 + F428 +
F569 + F64 + F75 + F129 + F433 + F365 + F9 + F443 + F473 +
F500 + F368 + F488 + SOP_VENDOR_ggg + F411 + F476 + F38 +
F87 + F104 + F484 + F349 + F84 + F72 + F56 + F554 + F131 +
F511 + F545 + F470 + F410 + F419 + F418 + F32 + SIL_VENDOR_ccc +
SOP_VENDOR_aaa + F320 + F66 + F321 + F94 + F132 + F575"
)
# model 1 baseline: logistic regression on the selected variables.
# NOTE(review): the warning below signals (quasi-)complete separation, which
# is consistent with the enormous coefficient estimates in the summary --
# the fitted model is unstable and its p-values should not be trusted.
m1_base = glm(m1_formula, data=df1, family=binomial(), control = list(maxit = 50))
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(m1_base)
##
## Call:
## glm(formula = m1_formula, family = binomial(), data = df1, control = list(maxit = 50))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -8.49 0.00 0.00 0.00 8.49
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.118e+17 9.574e+08 116800273 <2e-16 ***
## SIL_VENDOR_eee -1.860e+15 4.823e+06 -385570582 <2e-16 ***
## F103 -6.645e+15 6.460e+08 -10285437 <2e-16 ***
## F59 -5.886e+13 2.506e+05 -234897325 <2e-16 ***
## F21 -3.362e+11 2.914e+03 -115374448 <2e-16 ***
## F73 2.029e+13 3.648e+05 55628443 <2e-16 ***
## F428 1.586e+13 1.654e+05 95899494 <2e-16 ***
## F569 -2.292e+13 1.853e+05 -123660497 <2e-16 ***
## F64 -7.830e+13 4.308e+05 -181743602 <2e-16 ***
## F75 -1.049e+16 8.162e+07 -128475389 <2e-16 ***
## F129 -1.745e+14 1.553e+06 -112342200 <2e-16 ***
## F433 -5.630e+11 7.974e+03 -70601180 <2e-16 ***
## F365 -8.186e+16 8.225e+08 -99530315 <2e-16 ***
## F9 1.254e+16 1.150e+08 109020153 <2e-16 ***
## F443 -1.769e+15 1.240e+07 -142638391 <2e-16 ***
## F473 -6.100e+12 8.710e+04 -70034373 <2e-16 ***
## F500 -4.088e+11 5.341e+03 -76547229 <2e-16 ***
## F368 1.267e+17 1.051e+09 120583482 <2e-16 ***
## F488 6.391e+11 6.916e+03 92421264 <2e-16 ***
## SOP_VENDOR_ggg 1.388e+14 6.077e+06 22839541 <2e-16 ***
## F411 -1.353e+14 3.194e+06 -42363755 <2e-16 ***
## F476 2.204e+12 1.388e+05 15876370 <2e-16 ***
## F38 1.098e+14 4.245e+06 25853822 <2e-16 ***
## F87 -7.567e+15 1.347e+08 -56161019 <2e-16 ***
## F104 1.794e+17 2.053e+09 87379983 <2e-16 ***
## F484 3.712e+11 8.278e+03 44844919 <2e-16 ***
## F349 -9.179e+15 1.631e+08 -56282408 <2e-16 ***
## F84 2.020e+16 3.411e+08 59217406 <2e-16 ***
## F72 -6.039e+12 3.585e+05 -16846010 <2e-16 ***
## F56 -4.287e+16 2.950e+08 -145330525 <2e-16 ***
## F554 -2.064e+14 3.163e+06 -65255420 <2e-16 ***
## F131 -6.563e+16 7.862e+08 -83486519 <2e-16 ***
## F511 -2.169e+11 5.224e+03 -41515405 <2e-16 ***
## F545 5.685e+13 1.397e+06 40683253 <2e-16 ***
## F470 5.863e+13 4.735e+05 123817519 <2e-16 ***
## F410 2.752e+13 8.189e+05 33607936 <2e-16 ***
## F419 -1.676e+11 5.271e+03 -31803275 <2e-16 ***
## F418 1.247e+11 5.980e+03 20843840 <2e-16 ***
## F32 -8.543e+13 8.762e+05 -97495392 <2e-16 ***
## SIL_VENDOR_ccc -8.954e+14 5.923e+06 -151161347 <2e-16 ***
## SOP_VENDOR_aaa -1.246e+14 5.819e+06 -21415935 <2e-16 ***
## F320 -3.402e+15 7.863e+07 -43265617 <2e-16 ***
## F66 -1.821e+13 2.287e+05 -79618705 <2e-16 ***
## F321 7.536e+13 1.190e+06 63318441 <2e-16 ***
## F94 7.806e+17 1.007e+10 77480257 <2e-16 ***
## F132 1.846e+15 3.585e+07 51494879 <2e-16 ***
## F575 9.544e+14 2.577e+07 37037828 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 765.15 on 1566 degrees of freedom
## Residual deviance: 5406.55 on 1520 degrees of freedom
## AIC: 5500.5
##
## Number of Fisher Scoring iterations: 46
Handles NAs gracefully. Handles noisy data and high dimensionality.
# create copy (extraTrees handles NAs itself via na.action="fuse", so no imputation)
df2 <- df_stats
# preview dataframe
printWideDataFrame(df2, 20)
## PASS_FAIL F0 F1 F2 F3 F4 F5 F6 F7
## 1 1 3030.93 2564.00 2187.733 1411.1265 1.3602 100 97.6133 0.1242
## 2 1 3095.78 2465.14 2230.422 1463.6606 0.8294 100 102.3433 0.1247
## 3 0 2932.61 2559.94 2186.411 1698.0172 1.5102 100 95.4878 0.1241
## 4 1 2988.72 2479.90 2199.033 909.7926 1.3204 100 104.2367 0.1217
## 5 1 3032.24 2502.87 2233.367 1326.5200 1.5334 100 100.3967 0.1235
## 6 1 2946.25 2432.84 2233.367 1326.5200 1.5334 100 100.3967 0.1235
## F8 F9 F10 F11 F12 F13 F14 F15 F16
## 1 1.5005 0.0162 -0.0034 0.9455 202.4396 0 7.9558 414.8710 10.0433
## 2 1.4966 -0.0005 -0.0148 0.9627 200.5470 0 10.1548 414.7347 9.2599
## 3 1.4436 0.0041 0.0013 0.9615 202.0179 0 9.5157 416.7075 9.3144
## 4 1.4882 -0.0124 -0.0033 0.9629 201.8482 0 9.6052 422.2894 9.6924
## 5 1.5031 -0.0031 -0.0072 0.9569 201.9424 0 10.5661 420.5925 10.3387
## 6 1.5287 0.0167 0.0055 0.9699 200.4720 0 8.6617 414.2426 9.2441
## F17 F18 SIL_VENDOR_eee SIL_VENDOR_hhh SIL_VENDOR_aaa
## 1 0.9680 192.3963 0 0 0
## 2 0.9701 191.2872 0 0 0
## 3 0.9674 192.7035 1 0 0
## 4 0.9687 192.1557 0 1 0
## 5 0.9735 191.6037 0 0 1
## 6 0.9747 191.2280 0 1 0
## SIL_VENDOR_ggg SIL_VENDOR_bbb SIL_VENDOR_ccc SIL_VENDOR_iii
## 1 0 0 0 0
## 2 0 0 0 0
## 3 0 0 0 0
## 4 0 0 0 0
## 5 0 0 0 0
## 6 0 0 0 0
## SIL_VENDOR_fff ADHS_VENDOR_bbb ADHS_VENDOR_aaa SOP_VENDOR_eee
## 1 0 1 0 1
## 2 0 0 1 0
## 3 0 0 1 0
## 4 0 0 1 1
## 5 0 1 0 0
## 6 0 1 0 1
## SOP_VENDOR_aaa SOP_VENDOR_jjj SOP_VENDOR_iii SOP_VENDOR_ddd
## 1 0 0 0 0
## 2 1 0 0 0
## 3 0 1 0 0
## 4 0 0 0 0
## 5 0 0 1 0
## 6 0 0 0 0
## SOP_VENDOR_ccc SOP_VENDOR_kkk SOP_VENDOR_hhh SOP_VENDOR_bbb
## 1 0 0 0 0
## 2 0 0 0 0
## 3 0 0 0 0
## 4 0 0 0 0
## 5 0 0 0 0
## 6 0 0 0 0
## SOP_VENDOR_ggg SOP_VENDOR_fff
## 1 0 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 0
# run model and summarize
# NOTE(review): PASS_FAIL is passed as numeric 0/1, so extraTrees fits a
# REGRESSION ("type: numeric (regression)" in the printout below); its
# predictions are continuous scores, which is what the ROCR cutoff analysis
# downstream relies on. Confirm this was intended rather than classification
# on a factor response.
m2_base = extraTrees(df2 %>% select(-PASS_FAIL),df2$PASS_FAIL,numRandomCuts=1,na.action="fuse")
m2_base
## ExtraTrees:
## - # of trees: 500
## - node size: 5
## - # of dim: 622
## - # of tries: 207
## - type: numeric (regression)
## - multi-task: no
# extraTrees does not have a variable importance function
Run 5-fold cross-validation for both models, generate all ROC graphs, and export to PDF.
# create copy
df <- df_stats
n = 5 # number of folds
# NOTE(review): no set.seed() before sample(), so the fold assignment -- and
# therefore the numbers below -- are not reproducible run-to-run.
df = df[sample(nrow(df)),] # Randomly shuffle the data
folds = cut(seq(1,nrow(df)),breaks=n,labels=FALSE) # Create 5 equally size folds
# create empty matrix for accuracy and precision
# one row per fold; column 1 = model 1 (logistic), column 2 = model 2 (extraTrees)
accuracy = matrix(data=NA,nrow=n,ncol=2)
precision = matrix(data=NA,nrow=n,ncol=2)
cutoff = 0.50 # score threshold at which accuracy/precision are read off
pdf(file='./docs/Rplots.pdf',width=10,height=7.5) # begin pdf writer
# Perform 5 fold cross validation
for(i in 1:n){
# Segment the data by fold using the which() function
testIndexes = which(folds==i,arr.ind=TRUE)
testData = df[testIndexes, ]
trainData = df[-testIndexes, ]
# model 1: logistic regression
m1 = glm(m1_formula,data=trainData,family='binomial',control=list(maxit=50))
p1 = predict(m1,newdata=testData,type='response')
pr1 = prediction(p1,testData$PASS_FAIL)
prf1 = performance(pr1,measure="tpr",x.measure="fpr")
prec1 = performance(pr1,measure="prec")
acc1 = performance(pr1,measure="acc")
auc1 = performance(pr1,measure="auc")
# model 2: extremely random forest
m2 = extraTrees(trainData %>% select(-PASS_FAIL),trainData$PASS_FAIL,numRandomCuts=1,na.action="fuse")
p2 = predict(m2,testData %>% select(-PASS_FAIL))
pr2 = prediction(p2,testData$PASS_FAIL)
prf2 = performance(pr2,measure="tpr",x.measure="fpr")
prec2 = performance(pr2,measure="prec")
acc2 = performance(pr2,measure="acc")
auc2 = performance(pr2,measure="auc")
# graph results
par(pty="s") # square plotting region for the ROC curve
plot(prf1,main=paste('ROC: Fold ',i,sep=''),xaxs='i',yaxs='i',asp=1)
lines(prf2@x.values[[1]],prf2@y.values[[1]],col='red')
abline(a=0,b=1,lty=2) # chance diagonal
legend('bottomright',
c(paste('Model 1 | AUC=',format(round(auc1@y.values[[1]],3),3),sep=''),
paste('Model 2 | AUC=',format(round(auc2@y.values[[1]],3),3),sep='')),
col=c('black','red'),lty=c(1,1))
par(pty="m")
plot(prec1,main=paste('Precision: Fold ',i,sep=''),ylim=c(0.4,1))
lines(prec2@x.values[[1]],prec2@y.values[[1]],col='red')
abline(v=0.5,lty=2) # mark the 0.5 cutoff used below
legend('topleft',c('Model 1','Model 2'),col=c('black','red'),lty=c(1,1))
plot(acc1,main=paste('Accuracy: Fold ',i,sep=''),ylim=c(0.4,1))
lines(acc2@x.values[[1]],acc2@y.values[[1]],col='red')
abline(v=0.5,lty=2)
legend('topleft',c('Model 1','Model 2'),col=c('black','red'),lty=c(1,1))
# record the metric at the largest cutoff value still >= 0.50 on each curve
accuracy[i,1] = acc1@y.values[[1]][max(which(acc1@x.values[[1]]>=cutoff))]
accuracy[i,2] = acc2@y.values[[1]][max(which(acc2@x.values[[1]]>=cutoff))]
precision[i,1] = prec1@y.values[[1]][max(which(prec1@x.values[[1]]>=cutoff))]
precision[i,2] = prec2@y.values[[1]][max(which(prec2@x.values[[1]]>=cutoff))]
}
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
dev.off() # close pdf writer
## png
## 2
# defined as null hypothesis: m1-m2=0
# paired t-tests across the 5 folds: column 1 = model 1, column 2 = model 2
accuracy_test = t.test(accuracy[,1],accuracy[,2],conf.level=0.95,paired=T)
precision_test = t.test(precision[,1],precision[,2],conf.level=0.95,paired=T)
# per-fold accuracies at the 0.5 cutoff (col 1 = model 1, col 2 = model 2)
accuracy
## [,1] [,2]
## [1,] 0.9340659 0.9570957
## [2,] 0.9454545 0.9246862
## [3,] 0.9456522 0.9592593
## [4,] 0.8532110 0.9261993
## [5,] 0.9506173 0.9396226
accuracy_test
##
## Paired t-test
##
## data: accuracy[, 1] and accuracy[, 2]
## t = -0.94925, df = 4, p-value = 0.3962
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.06112014 0.02997530
## sample estimates:
## mean of the differences
## -0.01557242
# alpha = 0.05: fail to reject -> call it a tie; otherwise pick the higher mean
if(accuracy_test$p.value>0.05){
print("Model 1 and Model 2 accuracies are not significantly different.")
}else if(mean(accuracy[,1])>mean(accuracy[,2])){
print("Model 1 is statistically more accurate than Model 2.")
}else{
print("Model 2 is statistically more accurate than Model 1.")
}
## [1] "Model 1 and Model 2 accuracies are not significantly different."
# per-fold precisions at the 0.5 cutoff (col 1 = model 1, col 2 = model 2)
precision
## [,1] [,2]
## [1,] 0.9647059 0.9721254
## [2,] 0.9702970 0.9502262
## [3,] 1.0000000 0.9726562
## [4,] 0.9462366 0.9444444
## [5,] 0.9615385 0.9600000
precision_test
##
## Paired t-test
##
## data: precision[, 1] and precision[, 2]
## t = 1.3405, df = 4, p-value = 0.2512
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.00928246 0.02661268
## sample estimates:
## mean of the differences
## 0.008665112
# alpha = 0.05: fail to reject -> call it a tie; otherwise pick the higher mean
if(precision_test$p.value>0.05){
print("Model 1 and Model 2 precisions are not significantly different.")
}else if(mean(precision[,1])>mean(precision[,2])){
print("Model 1 is statistically more precise than Model 2.")
}else{
print("Model 2 is statistically more precise than Model 1.")
}
## [1] "Model 1 and Model 2 precisions are not significantly different."
The time series graphs of all the important variables can be found here ./docs/ImportantVariables.pdf
pdf(file='./docs/ImportantVariables.pdf',width=10,height=7.5) # begin pdf writer
# one time-series scatter per model-1 predictor (drop the intercept, sort names)
for(name in sort(names(m1_base$coefficients)[-1])){
p <- df_orig %>%
fastDummies::dummy_cols() %>% # add dummy variables for all string columns
# df_orig holds the raw SECOM coding (-1 = pass, 1 = fail); the original
# compared ==1, which labelled the failures "PASS" and inverted the legend
mutate(PASS_FAIL = as.factor(ifelse(PASS_FAIL==-1,"PASS","FAIL"))) %>%
ggplot(aes_string(x = "MFG_DATE", y = name)) +
geom_point(alpha = 0.4, aes(colour = PASS_FAIL)) +
scale_color_manual(values = c("FAIL" = "red", "PASS" = "black")) +
guides(colour = guide_legend(override.aes = list(alpha = 1, size = 2))) +
ggtitle(paste("Variable:",name,"vs. Time")) +
xlab("Manufacturing Date") +
ylab("Value")
print(p)
}
## Warning: Removed 2 rows containing missing values (geom_point).
## Warning: Removed 2 rows containing missing values (geom_point).
## Warning: Removed 9 rows containing missing values (geom_point).
## Warning: Removed 9 rows containing missing values (geom_point).
## Warning: Removed 8 rows containing missing values (geom_point).
## Warning: Removed 2 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 24 rows containing missing values (geom_point).
## Warning: Removed 2 rows containing missing values (geom_point).
## Warning: Removed 6 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 7 rows containing missing values (geom_point).
## Warning: Removed 14 rows containing missing values (geom_point).
## Warning: Removed 2 rows containing missing values (geom_point).
## Warning: Removed 2 rows containing missing values (geom_point).
## Warning: Removed 10 rows containing missing values (geom_point).
## Warning: Removed 2 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 6 rows containing missing values (geom_point).
## Warning: Removed 7 rows containing missing values (geom_point).
## Warning: Removed 6 rows containing missing values (geom_point).
## Warning: Removed 24 rows containing missing values (geom_point).
## Warning: Removed 24 rows containing missing values (geom_point).
## Warning: Removed 2 rows containing missing values (geom_point).
## Warning: Removed 2 rows containing missing values (geom_point).
## Warning: Removed 2 rows containing missing values (geom_point).
## Warning: Removed 260 rows containing missing values (geom_point).
## Warning: Removed 4 rows containing missing values (geom_point).
## Warning: Removed 273 rows containing missing values (geom_point).
## Warning: Removed 7 rows containing missing values (geom_point).
## Warning: Removed 7 rows containing missing values (geom_point).
## Warning: Removed 6 rows containing missing values (geom_point).
## Warning: Removed 794 rows containing missing values (geom_point).
## Warning: Removed 794 rows containing missing values (geom_point).
## Warning: Removed 24 rows containing missing values (geom_point).
## Warning: Removed 12 rows containing missing values (geom_point).
## Warning: Removed 2 rows containing missing values (geom_point).
## Warning: Removed 6 rows containing missing values (geom_point).
dev.off() # close the ImportantVariables.pdf device
## png
## 2